Natural Language Processing (NLP) for:
Harry Potter and the Philosopher’s Stone (1997)
Harry Potter and the Chamber of Secrets (1998)
Harry Potter and the Prisoner of Azkaban (1999)
Harry Potter and the Goblet of Fire (2000)
Harry Potter and the Order of the Phoenix (2003)
Harry Potter and the Half-Blood Prince (2005)
Harry Potter and the Deathly Hallows (2007)
# Reset the session: clear all objects from the global environment and run
# the garbage collector.
# NOTE(review): rm(list = ls()) in a script is an anti-pattern -- it clears
# only the global environment (not packages or options) and silently wipes
# the workspace of anyone who sources this file. Consider removing.
rm(list = ls())
gc()
# One-time setup (commented out): installers for the helper packages and the
# PhantomJS headless browser used to rasterise HTML widgets.
# install.packages('pacman') install.packages('remotes') library(remotes)
# install.packages('devtools') library(devtools)
# remotes::install_github('nrguimaraes/sentimentSetsR') library(magrittr)
# devtools::install_github('wch/webshot') webshot::install_phantomjs()
# library(BiocManager)
# BiocManager::install('https://bioconductor.org/biocLite.R')
# source('https://bioconductor.org/biocLite.R') biocLite('EBImage')
# Load (installing when missing) every package used below in one call.
library(pacman)
pacman::p_load(devtools, knitr, magrittr, dplyr, ggplot2, rvest, sentimentSetsR,
caret, textTinyR, text2vec, tm, tidytext, stringr, stringi, SnowballC, stopwords,
wordcloud, prettydoc, cowplot, kable, utf8, corpus, glue, topicmodels, stm, wordcloud2,
htmlwidgets, viridis)
# Global knitr chunk options for the rendered report.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, comment = NA, warning = FALSE,
tidy = TRUE, results = "hold", cache = FALSE, dpi = 120, fig.width = 8, fig.height = 6)
# Parameters: N-gram granularity (label and range) for the tokenisation steps.
ngrams <- "single words"
ngram <- c(1, 1)
## Number of Topics used by the topic-model section.
K <- 4
# Custom functions
## Concatenate the arguments (space-separated) and return the result marked
## so it prints without quotation marks.
pasteNQ <- function(...) {
    noquote(paste(...))
}
## As pasteNQ, but concatenates with no separator (paste0 semantics).
pasteNQ0 <- function(...) {
    noquote(paste0(...))
}
# Proportions
## Frequency table of `data` expressed as percentages of the total number of
## observations, rounded to `digits` decimal places. Remaining arguments are
## forwarded to base::table().
propTab <- function(data, exclude = NULL, useNA = "no", dnn = NULL, deparse.level = 1,
    digits = 0) {
    counts <- table(data, exclude = exclude, useNA = useNA, dnn = dnn,
        deparse.level = deparse.level)
    round(counts/NROW(data) * 100, digits)
}
## Chart Template
## Shared ggplot2 theme: light grey background, subtle major gridlines, no
## minor grid, ticks or legend by default. Greys come from the RColorBrewer
## "Greys" palette (attached via wordcloud's dependency on RColorBrewer --
## TODO confirm that dependency stays attached).
Grph_theme <- function() {
    greys <- brewer.pal("Greys", n = 9)
    bg <- greys[2]
    grid_col <- greys[3]
    title_col <- greys[9]
    # A single theme() call merges identically to the original chain of
    # one-setting theme() additions.
    theme_bw(base_size = 9) +
        theme(
            panel.background = element_rect(fill = bg, color = bg),
            plot.background = element_rect(fill = bg, color = bg),
            panel.border = element_rect(color = bg),
            panel.grid.major = element_line(color = grid_col, size = 0.25),
            panel.grid.minor = element_blank(),
            axis.ticks = element_blank(),
            legend.position = "none",
            legend.title = element_text(size = 11, color = "black"),
            legend.background = element_rect(fill = bg),
            legend.text = element_text(size = 9, color = "black"),
            strip.text.x = element_text(size = 9, color = "black", vjust = 1),
            plot.title = element_text(color = title_col, size = 20, vjust = 1.25),
            axis.text.x = element_text(size = 9, color = "black"),
            axis.text.y = element_text(size = 9, color = "black"),
            axis.title.x = element_text(size = 10, color = "black", vjust = 0),
            axis.title.y = element_text(size = 10, color = "black", vjust = 1.25),
            plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm")
        )
}
## Clean Corpus
## Lowercase the text, expand contractions, and collapse repeated whitespace.
## Prints a progress line after each stage and returns the cleaned vector.
basicclean <- function(rawtext) {
    # Set to lowercase
    rawtext <- tolower(rawtext)
    print(pasteNQ("Set to lowercase"))
    # Normalise typographic (curly) apostrophes from the scraped HTML so the
    # contraction patterns below can actually match them.
    rawtext <- gsub("\u2019", "'", rawtext)
    # Expand contractions. Irregular forms run first; the generic suffix
    # rules ("n't", "'ll", ...) run afterwards.
    # BUG FIX: the original called gsub("will not", "won't", ...) -- pattern
    # and replacement inverted -- so both "will not" and "won't" ended up as
    # the garbled "wo not" once the "n't" rule fired. "shan't" likewise never
    # matched because the pattern ("shant") lacked the apostrophe, and the
    # "'ld" rule was duplicated.
    fix_contractions <- function(rawtext) {
        rawtext <- gsub("won't", "will not", rawtext)
        rawtext <- gsub("shan't", "shall not", rawtext)
        rawtext <- gsub("shant", "shall not", rawtext)
        rawtext <- gsub("can't", "cannot", rawtext)
        rawtext <- gsub("can not", "cannot", rawtext)
        rawtext <- gsub("n't", " not", rawtext)
        rawtext <- gsub("'ll", " will", rawtext)
        rawtext <- gsub("'re", " are", rawtext)
        rawtext <- gsub("'ve", " have", rawtext)
        rawtext <- gsub("'m", " am", rawtext)
        rawtext <- gsub("'ld", " would", rawtext)
        rawtext <- gsub("'d", " would", rawtext)
        # Drop possessive / contracted "'s" entirely.
        rawtext <- gsub("'s", "", rawtext)
        return(rawtext)
    }
    rawtext <- fix_contractions(rawtext)
    print(pasteNQ("Fixed contractions"))
    # Strip whitespace (tm::stripWhitespace collapses runs of spaces).
    rawtext <- stripWhitespace(rawtext)
    print(pasteNQ("Stripped whitespace"))
    return(rawtext)
}
# Remove stop words
## Delete English (snowball) stop words from the text. Additional words to
## drop go in `remove`; stop words that should survive go in `retain`.
removestopwords <- function(rawtext, remove = NULL, retain = NULL) {
    # Assemble the custom list, then filter out the retained words.
    custom <- c(stopwords::stopwords("en", source = "snowball"), remove)
    custom <- custom[!custom %in% retain]
    rawtext <- removeWords(rawtext, custom)
    print(pasteNQ("Removed", length(custom), "stop words"))
    return(rawtext)
}
## Word Stemming
## Reduce each word to its stem (tm::stemDocument, Snowball algorithm).
wordstem <- function(rawtext) {
    stemmed <- stemDocument(rawtext)
    print(pasteNQ("Stemmed words"))
    return(stemmed)
}
## Remove Non-Alpha
## Strip punctuation, digits, and any remaining non-alphanumeric characters
## from the text.
removenonalpha <- function(rawtext) {
    rawtext <- removePunctuation(rawtext)
    rawtext <- removeNumbers(rawtext)
    # Keep only alphanumerics, "/", apostrophes, and spaces...
    rawtext <- gsub("[^[:alnum:]///' ]", "", rawtext)
    # ...then drop the apostrophes themselves.
    rawtext <- gsub("[']", "", rawtext)
    print(pasteNQ("Removed punctuation, numbers, and other none characters"))
    return(rawtext)
}
# Remove JavaScript from WordClouds
library("EBImage")
## Embed an htmlwidget in the document. When rasterise is TRUE (or the knit
## target is LaTeX), the widget is written to a temp HTML file, screenshotted
## with webshot/PhantomJS, and displayed as a PNG; otherwise the live widget
## is returned unchanged.
embed_htmlwidget <- function(widget, rasterise = TRUE) {
    outputFormat <- knitr::opts_knit$get("rmarkdown.pandoc.to")
    # BUG FIX: outputFormat is NULL outside of knitr, and NULL == "latex"
    # yields logical(0), which made the original `||` error whenever
    # rasterise = FALSE. identical() is safe for any value.
    if (rasterise || identical(outputFormat, "latex")) {
        html.file <- tempfile("tp", fileext = ".html")
        png.file <- tempfile("tp", fileext = ".png")
        htmlwidgets::saveWidget(widget, html.file, selfcontained = FALSE)
        # Long delay so the widget's JavaScript finishes rendering.
        webshot::webshot(html.file, file = png.file, vwidth = 700, vheight = 500,
            delay = 10)
        img <- EBImage::readImage(png.file)
        EBImage::display(img)
    } else {
        widget
    }
}
# NOTE(review): hard-coded, machine-specific working directory -- prefer a
# relative path or here::here() so the script is portable.
setwd("C:/Users/siebe/Documents/07_Books/Harry Potter/")
# Book titles in publication order; reused as document labels and factor levels.
titles <- c("Philosopher's Stone", "Chamber of Secrets", "Prisoner of Azkaban", "Goblet of Fire",
"Order of the Phoenix", "Half-Blood Prince", "Deathly Hallows")
# Local HTML files holding the full text of each book (same order as titles).
html <- c("Harry_Potter_and_the_Philosophers_Stone.html", "Harry_Potter_and_the_Chamber_of_Secrets.html",
"Harry_Potter_and_the_Prisoner_of_Azkaban.html", "Harry_Potter_and_the_Goblet_of_Fire.html",
"Harry_Potter_and_the_Order_of_the_Phoenix.html", "Harry_Potter_and_the_Half-Blood_Prince.html",
"Harry_Potter_and_the_Deathly_Hallows.html")
# Accumulators: one row per book-level / paragraph-triplet document (Text, Book).
books <- tibble(Text = as.character(), Book = as.character())
para3 <- tibble(Text = as.character(), Book = as.character())
# Parse each book: pull the <p> nodes, keep paragraphs of three or more words,
# then build one book-level document and paragraph-triplet documents per book.
for (i in 1:7) {
    rawtext <- read_html(html[i]) %>% html_nodes(xpath = "/html/body/p") %>% html_text(trim = TRUE)
    wordcount <- sapply(strsplit(rawtext, " "), length)
    paragraph <- rawtext[wordcount >= 3]
    # Book Level Documents
    books <- rbind(books, tibble(Text = str_c(paragraph, collapse = " "), Book = titles[i]))
    # Paragraph Level Documents: groups of three consecutive paragraphs.
    # BUG FIX: the original indexed paragraph[x:(x + 2)] past the end of the
    # vector when length(paragraph) is not a multiple of 3, so str_c() turned
    # the final triplet's Text into NA. Clamp the upper index instead.
    triplet <- do.call(rbind, lapply(seq(1, length(paragraph), by = 3), function(x) tibble(Text = str_c(paragraph[x:min(x +
        2, length(paragraph))], collapse = " "), Book = titles[i])))
    para3 <- rbind(para3, triplet)
}
# Page Level Documents
# Re-tokenise the book text into words, bucket every 250 words into a "page",
# re-assemble each bucket's text, and number pages sequentially within books.
pages <- books %>% unnest_tokens(Text, Text, strip_punct = T, to_lower = F) %>% group_by(Book,
Page = dplyr::row_number()%/%250) %>% dplyr::summarize(Text = stringr::str_c(Text,
collapse = " ")) %>% mutate(Page = dplyr::row_number()) %>% ungroup()
# Place books in chronological order (factor levels follow publication order).
books$Book <- factor(books$Book, levels = titles)
para3$Book <- factor(para3$Book, levels = titles)
pages$Book <- factor(pages$Book, levels = titles)

For this sentiment analysis, I want to grab the text around certain characters. Chapters and pages are too much text, as they can contain multiple story points, and sentences are too little text, as they likely contain little contextual information.
Instead, I plan to take paragraphs as the documents for my corpus. However, paragraphs can be single sentences, as when two characters exchange dialogue. Therefore, I take paragraph triplets: three consecutive paragraphs, each containing three or more words.
I start by checking for normal (or "normal-ish") distributions of words per paragraph triplet to make sure there is some consistency in document length. The word distributions appear to approximate a normal distribution, although with a non-trivial right tail. In addition, the word distributions are similar across books, making this a comparable level of analysis.
# Words per paragraph triplet: check document-length consistency before scoring.
para3$Wordcount <- sapply(strsplit(para3$Text %>% as.character(), " "), length)
pasteNQ0("Average Amount of Words per Paragraph Triplet")
summary(para3$Wordcount)
# Graph distribution of words all
ggplot(para3, aes(Wordcount, fill = I("#7F0909"))) + geom_histogram() + stat_bin(bins = 100) +
Grph_theme() + ylab("Frequency") + xlab("Count of Words") + ggtitle("Words per Paragraph Triplet")# Graph with book fill
ggplot(para3, aes(Wordcount, fill = Book)) + geom_area(stat = "bin") + scale_fill_manual(values = c("#946B2D",
"#0D6217", "#000A90", "#AAAAAA", "#000000", "#7F0909", "#FFC500")) + Grph_theme() +
ylab("Frequency") + xlab("Count of Words") + ggtitle("Words per Paragraph Triplet") +
# The lines below this chain are captured console output from the knit.
theme(legend.text = element_text(size = 8, color = "black")) + theme(legend.position = "bottom")[1] Average Amount of Words per Paragraph Triplet
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 52.00 77.00 88.54 111.00 530.00
I create three character dictionaries to compare against each other. The first contains our main trio, at least one of whom is mentioned in the vast majority of paragraph triplets.
# Create Dictionaries
# Character dictionaries, lower-cased to match the cleaned text. Note that
# "villians" is a (misspelled) variable name used consistently below.
trio <- c("harry", "ron", "hermione")
pasteNQ("Main Characters:")
paste(trio)
cat("\n")
heroes <- c("lily", "james", "hagrid", "dumbledore", "sirius", "lupin", "moody",
"slughorn", "dobby", "cedric", "luna", "tonks", "mcgonagall", "ginny", "order of the phoenix",
"neville")
pasteNQ("Number of Heroes:", length(heroes))
pasteNQ("Including:")
paste(heroes)
cat("\n")
# "dementor" and "dementors" are deliberately listed separately so the
# singular and plural are analysed as distinct terms.
villians <- c("snape", "draco", "lucius", "umbridge", "voldemort", "quirrell", "pettigrew",
"dementor", "greyback", "bellatrix", "dudley", "vernon", "petunia", "riddle",
"death eaters", "aragog", "basilisk", "nagini", "dementors")
pasteNQ("Number of Villians:", length(villians))
pasteNQ("Including:")
# Everything after this call is captured console output from the knit.
paste(villians)[1] Main Characters:
[1] "harry" "ron" "hermione"
[1] Number of Heroes: 16
[1] Including:
[1] "lily" "james" "hagrid"
[4] "dumbledore" "sirius" "lupin"
[7] "moody" "slughorn" "dobby"
[10] "cedric" "luna" "tonks"
[13] "mcgonagall" "ginny" "order of the phoenix"
[16] "neville"
[1] Number of Villians: 19
[1] Including:
[1] "snape" "draco" "lucius" "umbridge" "voldemort"
[6] "quirrell" "pettigrew" "dementor" "greyback" "bellatrix"
[11] "dudley" "vernon" "petunia" "riddle" "death eaters"
[16] "aragog" "basilisk" "nagini" "dementors"
# Sentiment Scores
## Net sentiment balance of a numeric score vector, as a percentage in
## [-100, 100]: (positive mass - negative mass) / total signed mass. NAs are
## ignored; returns 0 when there is no signed mass at all (avoids NaN).
average <- function(x) {
    pos_mass <- sum(x[x > 0], na.rm = TRUE)
    neg_mass <- abs(sum(x[x < 0], na.rm = TRUE))
    balance <- ((pos_mass - neg_mass)/(pos_mass + neg_mass)) * 100
    if (is.nan(balance)) 0 else as.numeric(balance)
}
# Score every paragraph triplet with the VADER dictionary, aggregating word
# scores through the custom average() balance function.
CleanText <- basicclean(para3$Text)
para3$Score <- sapply(CleanText, function(x) getSentiment(x, dictionary = "vader",
score.type = average))
# Discretise the continuous score into Negative / Neutral / Positive.
para3$Cross <- NA
para3$Cross <- ifelse(para3$Score < 0, "Negative", para3$Cross)
para3$Cross <- ifelse(para3$Score > 0, "Positive", para3$Cross)
para3$Cross <- ifelse(para3$Score == 0, "Neutral", para3$Cross)
cat("\n")
pasteNQ("Sentiment Levels")
# Everything after this call is captured console output from the knit.
propTab(para3$Cross)[1] Set to lowercase
[1] Fixed contractions
[1] Stripped whitespace
[1] Sentiment Levels
Negative Neutral Positive <NA>
43 10 47 0
# Subset text mentioning a main character
trio_sent <- data.frame(Trio = NA, Sentiment = NA)
# Find mean of sentiment scores for each main character and create ID of all trio
# for total row
trio_ID <- c()
for (string in trio) {
# Whole-word regex so e.g. "ron" does not match inside "front".
findstr <- paste0("\\b", string, "\\b")
trio_sent <- rbind(trio_sent, c(string, para3 %>% as.data.frame() %>% .[grepl(findstr,
CleanText), "Score"] %>% average() %>% as.numeric()))
trio_ID <- c(trio_ID, grep(findstr, CleanText))
}
# Sort data
trio_sent <- trio_sent %>% .[order(trio_sent$Sentiment %>% as.numeric()), ] %>% tidyr::drop_na()
# Symmetric y-axis limit for the chart below.
max_sent <- max(trio_sent$Sentiment %>% as.numeric() %>% abs() + 1)
# Add main character total (documents mentioning any of the trio, de-duplicated)
trio_ID <- unique(trio_ID)
trio_sent <- rbind(trio_sent, c("average of trio", para3 %>% as.data.frame() %>%
.[trio_ID, "Score"] %>% average() %>% as.numeric()))
# Graph (this chain continues onto the next line of the file)
trio_sent %>% mutate(Trio = factor(Trio, levels = Trio)) %>% ggplot(aes(x = Trio,
y = Sentiment %>% as.numeric() %>% round(1), fill = Trio)) + geom_col() + coord_flip() +
Grph_theme() + ylab("Sentiment (-100% to 100%)") + xlab("") + ggtitle("Sentiment of trio") +
ylim(max_sent * -1, max_sent) + scale_fill_manual(values = c("#FFC500", "#AAAAAA",
"#7F0909", "black")) + guides(fill = FALSE)

Surprisingly, Professor Slughorn has the most positive text around him. This is perhaps because Harry, for the first time, enjoyed and excelled in Potions with him as teacher. Additionally, Harry spends a non-trivial amount of time under the Felix Felicis potion while with Professor Slughorn.
Fan favorite Luna Lovegood has the second most positive text around her. This is not surprising, as she has a very positive-minded personality.
Professor Moody
# Subset text mentioning a hero
heroes_sent <- data.frame(Hero = NA, Sentiment = NA)
# Find mean of sentiment scores for each hero and create ID of all heroes for
# total row
heroes_ID <- c()
for (string in heroes) {
# Whole-word regex match against the cleaned paragraph-triplet text.
findstr <- paste0("\\b", string, "\\b")
heroes_sent <- rbind(heroes_sent, c(string, para3 %>% as.data.frame() %>% .[grepl(findstr,
CleanText), "Score"] %>% average() %>% as.numeric()))
heroes_ID <- c(heroes_ID, grep(findstr, CleanText))
}
# Sort data
heroes_sent <- heroes_sent %>% .[order(heroes_sent$Sentiment %>% as.numeric()), ] %>%
tidyr::drop_na()
# Symmetric y-axis limit for the chart below.
max_sent <- max(heroes_sent$Sentiment %>% as.numeric() %>% abs() + 1)
# Add hero total (documents mentioning any hero, de-duplicated)
heroes_ID <- unique(heroes_ID)
heroes_sent <- rbind(heroes_sent, c("average of heroes", para3 %>% as.data.frame() %>%
.[heroes_ID, "Score"] %>% average() %>% as.numeric()))
# Graph (this chain continues onto the next line of the file)
heroes_sent %>% mutate(Hero = factor(Hero, levels = Hero)) %>% ggplot(aes(x = Hero,
y = Sentiment %>% as.numeric() %>% round(1), fill = Hero)) + geom_col() + coord_flip() +
Grph_theme() + ylab("Sentiment (-100% to 100%)") + xlab("") + ggtitle("Sentiment of Heroes") +
ylim(max_sent * -1, max_sent) + scale_fill_manual(values = c(viridis(nrow(heroes_sent) -
1), "black")) + guides(fill = FALSE)

It makes sense that Tom Riddle, Professor Quirrell, and perhaps even Professor Snape have more positive text than negative text around them.
Professor Umbridge, surprisingly, has the most positive sentiment. As a character, she projects an "everything is fine and I'm here to help" persona while she seeks to torture the main trio.
I intentionally defined "dementor" and "dementors" separately. In the plural, they carry high sentiment. However, the most interesting takeaway is that, in the singular, a dementor has more positive than negative text around it. This actually makes sense, as Harry must summon positive thoughts/memories to combat the villains that are the personification of depression.
Unsurprisingly, the werewolf, ancient snake, and giant spider all have extremely negative sentiment.
# Subset text mentioning a villian
villians_sent <- data.frame(Villian = NA, Sentiment = NA)
# Find mean of sentiment scores for each villian and create ID of all villians
# for total row
villians_ID <- c()
for (string in villians) {
# Whole-word regex match against the cleaned paragraph-triplet text.
findstr <- paste0("\\b", string, "\\b")
villians_sent <- rbind(villians_sent, c(string, para3 %>% as.data.frame() %>%
.[grepl(findstr, CleanText), "Score"] %>% average() %>% as.numeric()))
villians_ID <- c(villians_ID, grep(findstr, CleanText))
}
# Sort data
villians_sent <- villians_sent %>% .[order(villians_sent$Sentiment %>% as.numeric()),
] %>% tidyr::drop_na()
# Symmetric y-axis limit for the chart below.
max_sent <- max(villians_sent$Sentiment %>% as.numeric() %>% abs() + 1)
# Add villian total (documents mentioning any villian, de-duplicated)
villians_ID <- unique(villians_ID)
villians_sent <- rbind(villians_sent, c("average of villians", para3 %>% as.data.frame() %>%
.[villians_ID, "Score"] %>% average() %>% as.numeric()))
# Graph (this chain continues onto the next line of the file)
villians_sent %>% mutate(Villian = factor(Villian, levels = Villian)) %>% ggplot(aes(x = Villian,
y = Sentiment %>% as.numeric() %>% round(1), fill = Villian)) + geom_col() +
coord_flip() + Grph_theme() + ylab("Sentiment (-100% to 100%)") + xlab("") +
ggtitle("Sentiment of Villians") + ylim(max_sent * -1, max_sent) + scale_fill_manual(values = c(viridis(nrow(villians_sent) -
1), "black")) + guides(fill = FALSE)

Here, I want to note the prevalence of the main trio, the heroes, and the villains. At least one of the main trio is mentioned in ~85% of paragraph triplets. The heroes outside the main trio appear in less than 50% of paragraph triplets, while villains are much rarer (less than 30%).
This demonstrates that J.K. Rowling prefers to focus on the likeable characters and keeps the overall story arc, featuring the villains, to a minimum.
# Share (%) of paragraph triplets mentioning at least one character per group.
docs_prop <- tibble(Proportion = c(round(length(trio_ID)/nrow(para3) * 100, 1), round(length(heroes_ID)/nrow(para3) *
100, 1), round(length(villians_ID)/nrow(para3) * 100, 1)), Characters = c("Main Trio",
"Heroes", "Villians"))
# Group -> bar colour mapping for the chart.
colors <- c(Heroes = "#7F0909", Villians = "#0D6217", `Main Trio` = "#AAAAAA")
# Bar chart with value labels (this chain continues onto the next file line).
ggplot(docs_prop, mapping = aes(x = Characters, y = Proportion, fill = Characters,
label = Proportion)) + geom_col() + geom_text(aes(label = Proportion), vjust = -0.5) +
ggtitle("Characters as Proportion of Documents") + Grph_theme() + scale_fill_manual(values = colors) +
theme(legend.position = "right") + ylim(0, 100) + labs(x = "Books", y = "Proportion (%)",
color = "Legend")

In the chart below, I show the average sentiment for the main trio, the heroes, and the villains as the series progresses. The main takeaways I see are:
# Main Characters Sentiment
# For each group, compute the share (%) of its paragraph triplets classed
# Negative, per book. Note the y-axis label: higher = more negative text.
trio_grp <- para3[trio_ID, ] %>% mutate(Negative = recode(Cross, Positive = 0, Neutral = 0,
Negative = 1), Neutral = recode(Cross, Positive = 0, Neutral = 1, Negative = 0)) %>%
group_by(Book) %>% summarize(`Trio Sentiment` = mean(Negative, na.rm = T) * 100)
# Heroes Sentiment
heroes_grp <- para3[heroes_ID, ] %>% mutate(Negative = recode(Cross, Positive = 0,
Neutral = 0, Negative = 1), Neutral = recode(Cross, Positive = 0, Neutral = 1,
Negative = 0)) %>% group_by(Book) %>% summarize(`Heroes Sentiment` = mean(Negative,
na.rm = T) * 100)
# Villians Sentiment
villians_grp <- para3[villians_ID, ] %>% mutate(Negative = recode(Cross, Positive = 0,
Neutral = 0, Negative = 1), Neutral = recode(Cross, Positive = 0, Neutral = 1,
Negative = 0)) %>% group_by(Book) %>% summarize(`Villians Sentiment` = mean(Negative,
na.rm = T) * 100)
# Plot
# Join the three per-book series into one frame for the line chart.
df <- inner_join(heroes_grp, villians_grp, by = "Book") %>% as_tibble()
df <- inner_join(trio_grp, df, by = "Book")
colors <- c(Heroes = "#7F0909", Villians = "#0D6217", `Main Trio` = "#AAAAAA")
# Line chart (this chain continues onto the next line of the file).
ggplot(df, mapping = aes(x = Book, group = NA)) + geom_line(aes(y = `Trio Sentiment`,
color = "Main Trio"), size = 3) + geom_line(aes(y = `Heroes Sentiment`, color = "Heroes"),
size = 1.5) + geom_line(aes(y = `Villians Sentiment`, color = "Villians"), size = 1.5) +
ggtitle("Sentiment of Heroes/Villians Across Books") + Grph_theme() + scale_color_manual(values = colors) +
theme(axis.text.x = element_text(size = 8, angle = 45)) + theme(legend.position = "right") +
ylim(0, 60) + labs(x = "Books", y = "Negative Sentiment (%)", color = "Legend")

At the beginning of this document, I defined the page level as containing 250 words.
# Sentiment Scores
# (Duplicate of the average() defined earlier in the file -- re-declared so
# this section can run on its own; behaviour is identical.)
average <- function(x) {
    positives <- sum(x[x > 0], na.rm = TRUE)
    negatives <- abs(sum(x[x < 0], na.rm = TRUE))
    balance <- ((positives - negatives)/(positives + negatives)) * 100
    if (is.nan(balance)) 0 else as.numeric(balance)
}
# Score each 250-word page with the VADER dictionary (same pipeline as the
# paragraph triplets above).
CleanText <- basicclean(pages$Text)
pages$Score <- sapply(CleanText, function(x) getSentiment(x, dictionary = "vader",
score.type = average))
# Discretise the continuous score into Negative / Neutral / Positive.
pages$Cross <- NA
pages$Cross <- ifelse(pages$Score < 0, "Negative", pages$Cross)
pages$Cross <- ifelse(pages$Score > 0, "Positive", pages$Cross)
pages$Cross <- ifelse(pages$Score == 0, "Neutral", pages$Cross)
# BUG FIX: was cat("/n"), which printed a literal "/n" (visible in the
# captured output below) instead of a newline.
cat("\n")
pasteNQ("Sentiment Levels")
propTab(pages$Cross)

[1] Set to lowercase
[1] Fixed contractions
[1] Stripped whitespace
/n[1] Sentiment Levels
Negative Neutral Positive
46 0 54
Finally, we can view sentiment per page to see the rollercoaster of emotions throughout the books. The main takeaways that I see are:
# Grouping by pages for the x-axis
# Per-page sentiment bars, one panel per book; free x scales because page
# counts differ between books.
pages %>% group_by(Book) %>% mutate(Paragraph = row_number()) %>% ggplot(aes(Page,
Score, fill = Book)) + Grph_theme() + geom_bar(alpha = 0.5, stat = "identity",
show.legend = FALSE) + scale_fill_manual(values = c("#946B2D", "#0D6217", "#000A90",
"#AAAAAA", "#000000", "#7F0909", "#FFC500")) + facet_wrap(~Book, ncol = 2, scales = "free_x")